1. Male and female graduate enrollment (1967 - 2015). Compare trends in total graduate enrollment for males and females (including full-time/part-time and private/public universities) in the United States from 1967 - 2015. Describe your results statistically, graphically and in text. x

reminder: cannot split public_all/private_All by gender does not equate to gender column

# Exploratory line graphs

###########################This is what the assingment asks for##############################################
# total enrollment males vs females
total_enrol <- grad2 %>% 
  ggplot(aes(x = Year, y = Total)) +
  geom_line(aes(color = Gender)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Enrollment in Post-Secondary Institutions"
  ) +
  scale_x_continuous(expand=c(0,0))

total_enrol

##################################################################################

# Below are extra graphs if we want
# full time enrollment men vs women
full_time <- grad2 %>% 
  ggplot(aes(x = Year, y = Full_time)) +
  geom_line(aes(color = Gender)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Full Time Enrollment in Post-Secondary Institutions"
  ) +
  scale_x_continuous(expand=c(0,0))

full_time

# part time enrollment men vs women 

part_time <- grad2 %>% 
  ggplot(aes(x = Year, y = Part_time)) +
  geom_line(aes(color = Gender)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Part Time Enrollment in Post-Secondary Institutions"
  ) +
  scale_x_continuous(expand=c(0,0))

part_time

Continuation of 1: Statistical tests Need to compare enrollment over time of men vs women

# Normality tests

Ftotal_hist <- grad %>% 
  ggplot(aes(x = Total_F)) +
  geom_histogram()

Ftotal_hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ftotal_qq <- grad %>% 
  ggplot(aes(sample = Total_F)) +
  geom_qq()

Ftotal_qq

Mtotal_hist <- grad %>% 
  ggplot(aes(x = Total_M)) +
  geom_histogram()

Mtotal_hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Mtotal_qq <- grad %>% 
  ggplot(aes(sample = Total_M)) +
  geom_qq()

Mtotal_qq

Linear Regressions part 1

####### Linear Regression #####################
enroll_lm1 <- lm(Total_M ~ Year, data = grad)
# Male Salary = -17112153 + 9069(Year)
enroll_lm2 <- lm(Total_F ~ Year, data = grad)
# Female salary = -58955502 + 30126(Year)



# check the model diagnostics
par(mfrow = c(2,2))
  plot(enroll_lm1)

par(mfrow = c(2,2))
  plot(enroll_lm2)

# check model fit
  summary(enroll_lm1) 
## 
## Call:
## lm(formula = Total_M ~ Year, data = grad)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17112153    1087024  -15.74   <2e-16 ***
## Year             9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
 # overall model fit p < 0.001
  
  
####### Regression graphs ######
  
male_enroll_graph <- ggplot(grad, aes(x = Year, y = Total_M)) + 
  geom_point() +
  geom_smooth(method = lm, se = TRUE, size = 0.5, color = "gray20") + theme_bw() +
  scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
  labs(x = "Year", y = "Male Enrollment")
male_enroll_graph

female_enroll_graph <- ggplot(grad, aes(x = Year, y = Total_F)) + 
  geom_point() +
  geom_smooth(method = lm, se = TRUE, size = 0.5, color = "gray20") + theme_bw() +
  scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
  labs(x = "Year", y = "Female Enrollment")
female_enroll_graph

#### Both male and female enrollment linear regressions  ############
all_enroll_graph <- ggplot(grad, aes(x = Year, y = Total_M)) + 
  geom_point(color = "steelblue4") +
  geom_smooth(method = lm, se = TRUE, size = 0.5, color = "gray20", fill = "steelblue3") + theme_bw() +
  scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
  geom_point(data = grad, aes(x =  Year, y = Total_F), color = "indianred2") +
  geom_smooth(data = grad, aes(x =  Year, y = Total_F), method = lm, se = TRUE, size = 0.5, color = "gray20", fill =   "lightcoral") +   
  theme_bw() +
  labs(x = "Year", y = "Enrollment")
all_enroll_graph

Stargazer

MF_table <- stargazer(enroll_lm1, enroll_lm2, type="html")
Dependent variable:
Total_M Total_F
(1) (2)
Year 9,069.301*** 30,126.030***
(545.955) (583.175)
Constant -17,112,153.000*** -58,955,502.000***
(1,087,024.000) (1,161,132.000)
Observations 49 49
R2 0.854 0.983
Adjusted R2 0.851 0.982
Residual Std. Error (df = 47) 54,046.810 57,731.420
F Statistic (df = 1; 47) 275.952*** 2,668.611***
Note: p<0.1; p<0.05; p<0.01
MF_table
[1] “”
[2] “ "
[3] “ "
[4] “ "
[5] “ "
[6] “ "
[7] “ "
[8] “ "
[9] “ "
[10] “ "
[11] “ "
[12] “ "
[13] “ "
[14] “ "
[15] “ "
[16] “ "
[17] “ " [18] “
Dependent variable:
Total_M Total_F
(1) (2)
Year 9,069.301*** 30,126.030***
(545.955) (583.175)
Constant -17,112,153.000*** -58,955,502.000***
(1,087,024.000) (1,161,132.000)
Observations 49 49
R2 0.854 0.983
Adjusted R2 0.851 0.982
Residual Std. Error (df = 47) 54,046.810 57,731.420
F Statistic (df = 1; 47) 275.952*** 2,668.611***
Note: p<0.1; p<0.05; p<0.01

"

  1. Shifts in female PhD recipients by field (1985, 2000, and 2015). Describe if and how there was a shift in PhDs awarded to females in four fields (Physical and Earth Sciences, Engineering, Education, and Humanities & Arts) in 1985, 2000, and 2015. Describe your results statistically, in a graph or table, and in text. Note: There are several ways that you can interpret this question. You are invited to decide which you think is/are most interesting. Just be really clear about what you are asking/answering in your report.
# exploratory column graph

phd_shifts <- phd_tidy %>% 
  filter(Field == "Physical science" | Field == "Engineering" | Field == "Education" | Field == "Humanities", Sex    == "F") %>% 
  ggplot(aes(x = Year, y = Number)) +
  geom_col(aes(fill = Field), position = "dodge") +
  scale_x_continuous(expand=c(0,0), breaks = seq(1985,2015, by=15)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Female PhDs Awarded"
  )+
  scale_fill_manual(breaks = c("Education", "Engineering", "Humanities", "Physical science"), values = c("chartreuse4","dodgerblue3","darkgoldenrod2","tomato1"))

phd_shifts

#stacked graph 

phd_shifts_stacked <- phd_tidy %>% 
  filter(Field == "Physical science" | Field == "Engineering" | Field == "Education" | Field == "Humanities", Sex    == "F") %>% 
  ggplot(aes(x = Year, y = Number , fill = Field))+
  geom_bar(stat = "identity") +
  scale_x_continuous(expand=c(0,0), breaks = seq(1985,2015, by=15)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Female PhDs Awarded"
  )+
  scale_fill_manual(breaks = c("Education", "Engineering", "Humanities", "Physical science"), values = c("chartreuse4","dodgerblue3","darkgoldenrod2","tomato1"))

phd_shifts_stacked

#chi squared test on female "is there a sigificant difference in females receiving doctoral degrees in each field in 1985, 2000 and 2015?"

#Ho - there is no association between year and number of women in each field 
#Ha - there IS an association between year and number of women in each field 

phd_chi2 <- phd_chi %>% 
  select(-Year) %>%
  filter(physical != "NA")

row.names(phd_chi2) <- c("1985","2000","2015")
## Warning: Setting row names on a tibble is deprecated.
#Another table with counts: 
  

phd_chi3 <- phd_chi2 %>% 
  mutate(rowSums(phd_chi2))

#Perform chi-square 

phd_x2test <- chisq.test(phd_chi2) 

phd_x2test
## 
##  Pearson's Chi-squared test
## 
## data:  phd_chi2
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
#we reject the null hypothesis beacuse the p-value is < .001. The field in which women receive their doctoral degrees differs significantly between 1985, 2000 and 2015. There is a significant association between year and number of docotoral degrees awarded to women in each field. 


#next: make a table

phdfield_prop <- prop.table(as.matrix(phd_chi2), 1) %>% 
  round(2)

phdfield_prop
##      physical engineering education humanities
## 1985     0.10        0.04      0.62       0.25
## 2000     0.12        0.10      0.48       0.31
## 2015     0.19        0.22      0.33       0.27
phdfield_prop_table <- kable(phdfield_prop, col.names = c("Physical and Earth Sciences", "Engineering", "Education", "Humanities"), align = "c") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F) %>% 
  column_spec(1, bold = T)


phdfield_prop_table
Physical and Earth Sciences Engineering Education Humanities
1985 0.10 0.04 0.62 0.25
2000 0.12 0.10 0.48 0.31
2015 0.19 0.22 0.33 0.27
  1. Male and female salaries for starting postdoctoral and other employment positions (2015). Compare median salaries for male and female doctorate recipients in 2015. Answer these two questions: Does median salary differ significantly between male and female starting postdoc positions? Does median salary differ significantly between male and female PhD recipients in non-postdoc employment positions?
# Comparing medians use Wilcoxon signed-rank for paired data

################## Compare median salaries for non-postdoc positions ################

# Mann-Whitney U
emp_mwu <- wilcox.test(median$`Emp M`, median$`Emp F`, paired = TRUE)
## Warning in wilcox.test.default(median$`Emp M`, median$`Emp F`, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(median$`Emp M`, median$`Emp F`, paired =
## TRUE): cannot compute exact p-value with zeroes
emp_mwu
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  median$`Emp M` and median$`Emp F`
## V = 101, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
# V =  101 p-value = 0.002572 (SIGNIFICANT)

# Cliffs Delta (effect size)
emp_cliff <- cliff.delta(median$`Emp M`, median$`Emp F`)
emp_cliff
## 
## Cliff's Delta
## 
## delta estimate: 0.2133333 (small)
## 95 percent confidence interval:
##        inf        sup 
## -0.2155378  0.5732121
# Cliff delta = 0.2133 (small)


################### compare median salaries for postdoc positions ###############

# Explotory histogram - male median salaries 
# bins
k <- 2*((NROW(na.omit(median$`PS M`)))^(1/3))

psmale_hist <- ggplot(median, aes(x = `PS M`)) +
  geom_histogram(bins = k) +
  theme_classic() +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous (expand = c(0,0))
psmale_hist

# normal - more evenly spread 

# Exploratory QQ - male median salary 

psmale_qq <- ggplot(median, aes(sample = `PS M`)) +
  geom_qq() 
psmale_qq

# overall normal, a few more abnormal points between theoretical 0 and 1 

# histogram - female median salary
#bins
k <- 2*((NROW(na.omit(median$`PS F`)))^(1/3))

psfemale_hist <- ggplot(median, aes(x = `PS F`)) +
  geom_histogram(bins = k) +
  theme_classic() +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous (expand = c(0,0))
psfemale_hist

# relatively normal  

# Exploratory QQ - female median salary 

psfemale_qq <- ggplot(median, aes(sample = `PS F`)) +
  geom_qq() 
psfemale_qq

# overall relatively normal

# Wilcoxon signed-rank

# HO: Ranks are equal (medians are equal)
# HA: Ranks are NOT equal

ps_mwu <- wilcox.test(median$`PS M`, median$`PS F`, paired = TRUE)
## Warning in wilcox.test.default(median$`PS M`, median$`PS F`, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(median$`PS M`, median$`PS F`, paired =
## TRUE): cannot compute exact p-value with zeroes
ps_mwu
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  median$`PS M` and median$`PS F`
## V = 19.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
# V = 19.5 p-value = 0.8884 (NOT significant)

# Cliffs Delta (effect size)

ps_cliff <- cliff.delta(median$`PS M`, median$`PS F`)
ps_cliff
## 
## Cliff's Delta
## 
## delta estimate: 0.04 (negligible)
## 95 percent confidence interval:
##        inf        sup 
## -0.3710946  0.4379849
# Cliff delta = 0.04 (negligible)


############################# Box & Whisker plot Emp #################################
emp_median <- median_box %>% 
  filter(type == "emp")

empmedian_box <- ggplot(emp_median, aes(x = sex, y = salary)) +
  geom_boxplot(aes(color = sex))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),text = element_text(family = "Arial"))+
  xlab("Sex")+
  ylab("Median Salary") +
  scale_x_discrete(breaks = c("female","male"), labels = c("Female", "Male")) +
  scale_color_manual(breaks = c("Female", "Male"), values = c("lightcoral", "steelblue3"))
  
empmedian_box

########################### Box & Whisker plot Postdoc ###############################
ps_median <- median_box %>% 
  filter(type == "ps")

psmedian_box <- ggplot(ps_median, aes(x = sex, y = salary)) +
  geom_boxplot(aes(color = sex))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),text = element_text(family = "Arial"))+
  xlab("Sex")+
  ylab("Median Salary") +
  scale_x_discrete(breaks = c("female","male"), labels = c("Female", "Male")) +
  scale_color_manual(breaks = c("Female", "Male"), values = c("lightcoral", "steelblue3"))
  
psmedian_box

######################### bar graph non-postdoc employment ###############
emp_median_bar <- median_tidy %>% 
  filter (type == "emp") 

emp_col <- ggplot(emp_median_bar, aes(x = field, y = salary, fill = sex)) +
  geom_col(aes(fill = sex), position = "dodge") +
  scale_y_continuous(expand = c(0,0), limits = c(0,125000)) + 
  labs(y = "Salary (USD)", x = "Field of Study") +
  theme_classic() + 
  scale_fill_manual(name = "Sex",
                    breaks = c("f", "m"),
                    values = c("lightcoral", "steelblue3"),
                    labels = c("Female", "Male")) +
  theme(axis.text.x = element_text(angle = 66, hjust = 1)) 
emp_col

######################## bar graph postdoc employment ##################
ps_median_bar <- median_tidy %>% 
  filter(type == "ps")

ps_col <- ggplot(ps_median_bar, aes(x = field, y = salary, fill = sex)) +
  geom_col(aes(fill = sex), position = "dodge") +
  scale_y_continuous(expand = c(0,0), limits = c(0,71000), breaks = seq(0,70000, by = 10000)) + 
  labs(y = "Salary (USD)", x = "Field of Study") +
  theme_classic() + 
  scale_fill_manual(name = "Sex",
                    breaks = c("f", "m"),
                    values = c("lightcoral", "steelblue3"),
                    labels = c("Female", "Male")) +
  theme(axis.text.x = element_text(angle = 70, hjust = 1)) 
ps_col

  1. Exploring academic salaries for professors in U.S. colleges. Explore relationships between variables in the ‘Faculty salary data (2008 - 2009 survey)’ dataset. Develop a model describing faculty salary based on data for faculty sex, rank, years in current position, field, and number of years since doctoral degree was earned. You should make decisions regarding which variables should remain in your final model. Describe the results qualitatively and quantitatively (i.e., don’t just report the statistical results of the model – make sure you describe interesting findings in text). You can also discuss any concerns that you have with the model(s) you present, if any.
# tests on the data

# correlation years since phd and years faculty service
yrs_plot <- faculty %>% 
  ggplot(aes(x = Yrs_Since_PhD, y = Yrs_Faculty_Service)) +
  geom_point(aes(color = Sex), alpha = 0.5)

yrs_plot

#correlation years faculty service vs salary

service_plot <- faculty %>% 
  ggplot(aes(x = Yrs_Faculty_Service, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5)

service_plot

#correlation years faculty service v salary 

salary_yrs_plot <- faculty %>% 
  ggplot(aes(x = Yrs_Since_PhD, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5)

salary_yrs_plot

# change reference levles

# Rank: reference level = profesor
faculty$Rank <- factor(faculty$Rank)
faculty$Rank <- fct_relevel(faculty$Rank, "Prof")

# aggregate model 
salary_lm1 <- lm(Salary ~ Rank + Yrs_Since_PhD + Yrs_Faculty_Service + Sex + Discipline, data = faculty)

vif(salary_lm1)
##                         GVIF Df GVIF^(1/(2*Df))
## Rank                2.013193  2        1.191163
## Yrs_Since_PhD       7.518936  1        2.742068
## Yrs_Faculty_Service 5.923038  1        2.433729
## Sex                 1.030805  1        1.015285
## Discipline          1.064105  1        1.031555
# yrs since phd = 7
# yrs faculty service = 5.9
# everything else less than 4

# summarized model

salary_lm2 <- lm(Salary ~ Rank + Yrs_Faculty_Service + Sex + Discipline, data = faculty)

#vif test on lm2

vif(salary_lm2)
##                         GVIF Df GVIF^(1/(2*Df))
## Rank                1.597441  2        1.124233
## Yrs_Faculty_Service 1.627110  1        1.275582
## Sex                 1.030803  1        1.015284
## Discipline          1.029040  1        1.014416
#summarized model with years since phd instead 

salary_lm3 <- lm(Salary ~ Rank + Yrs_Since_PhD + Sex + Discipline, data = faculty)

vif(salary_lm3)
##                   GVIF Df GVIF^(1/(2*Df))
## Rank          1.987205  2        1.187301
## Yrs_Since_PhD 2.065517  1        1.437191
## Sex           1.028359  1        1.014080
## Discipline    1.055727  1        1.027486
#summarzied model with only rank, discipline and sex

salary_lm4 <- lm(Salary ~ Rank + Sex + Discipline, data = faculty)

# really simple model on just rank and discipline

salary_lm5 <- lm(Salary ~ Rank + Discipline, data = faculty)
plot(salary_lm5)

summary(salary_lm5)
## 
## Call:
## lm(formula = Salary ~ Rank + Discipline, data = faculty)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65990 -14049  -1288  10760  97996 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     119788       1813  66.074  < 2e-16 ***
## RankAssocProf   -34082       3160 -10.786  < 2e-16 ***
## RankAsstProf    -47844       3112 -15.376  < 2e-16 ***
## DisciplineB      13761       2296   5.993 4.65e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared:  0.445,  Adjusted R-squared:  0.4407 
## F-statistic:   105 on 3 and 393 DF,  p-value: < 2.2e-16
#AIC on all models 

AIC(salary_lm1)
## [1] 9093.826
AIC(salary_lm2)
## [1] 9096.813
AIC(salary_lm3)
## [1] 9097.22
AIC(salary_lm4)
## [1] 9095.454
#lm1 has a lower AIC, but our judegement tells us lm2 is probably better 

################### Graphs testing for interaction terms ###########################

# graphs for testing for interaction between ranks and years of service/ ranks and years since phd

service_rank <- ggplot(faculty, aes(x = Yrs_Faculty_Service, y = Salary, color = Rank)) +
  geom_point(aes(pch = Sex)) +
  geom_smooth(aes(Yrs_Faculty_Service, Salary, color = Rank), method = lm, se = FALSE)
service_rank

since_rank <- ggplot(faculty, aes(x = Yrs_Since_PhD, y = Salary, color = Rank)) +
  geom_point() + 
  geom_smooth(aes(Yrs_Since_PhD, Salary, color = Rank), method = lm, se = FALSE)
since_rank 

# graph to test interaction between years of service and discipline and years since phd and discipline 

service_discipline <- ggplot(faculty, aes(x = Yrs_Faculty_Service, y = Salary, color = Discipline)) +
  geom_point(aes(pch = Sex)) +
  geom_smooth(aes(Yrs_Faculty_Service, Salary, color = Discipline), method = lm, se = FALSE)
service_discipline 

since_discipline <- ggplot(faculty, aes(x = Yrs_Since_PhD, y = Salary, color = Discipline)) +
  geom_point() +
  geom_smooth(aes(Yrs_Since_PhD, Salary, color = Discipline), method = lm, se = FALSE)
since_discipline

# experimental models to test interaction terms 

# first model testing interaction between years of service and rank 

inttest_lm1 <- lm(Salary ~ Rank + Yrs_Faculty_Service + Sex + Discipline + Yrs_Faculty_Service*Rank, data = faculty)
inttest_lm1
## 
## Call:
## lm(formula = Salary ~ Rank + Yrs_Faculty_Service + Sex + Discipline + 
##     Yrs_Faculty_Service * Rank, data = faculty)
## 
## Coefficients:
##                       (Intercept)                      RankAssocProf  
##                         116702.47                          -31081.25  
##                      RankAsstProf                Yrs_Faculty_Service  
##                         -50761.59                             -52.36  
##                           SexMale                        DisciplineB  
##                           4748.70                           13471.42  
## RankAssocProf:Yrs_Faculty_Service   RankAsstProf:Yrs_Faculty_Service  
##                           -261.37                             987.92
plot(inttest_lm1)

summary(inttest_lm1)
## 
## Call:
## lm(formula = Salary ~ Rank + Yrs_Faculty_Service + Sex + Discipline + 
##     Yrs_Faculty_Service * Rank, data = faculty)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65007 -14267  -1367  10853  98612 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       116702.47    4766.08  24.486  < 2e-16
## RankAssocProf                     -31081.25    5380.15  -5.777 1.56e-08
## RankAsstProf                      -50761.59    6062.22  -8.373 1.02e-15
## Yrs_Faculty_Service                  -52.36     121.56  -0.431    0.667
## SexMale                             4748.70    3886.29   1.222    0.222
## DisciplineB                        13471.42    2318.39   5.811 1.30e-08
## RankAssocProf:Yrs_Faculty_Service   -261.37     307.50  -0.850    0.396
## RankAsstProf:Yrs_Faculty_Service     987.92    1871.21   0.528    0.598
##                                      
## (Intercept)                       ***
## RankAssocProf                     ***
## RankAsstProf                      ***
## Yrs_Faculty_Service                  
## SexMale                              
## DisciplineB                       ***
## RankAssocProf:Yrs_Faculty_Service    
## RankAsstProf:Yrs_Faculty_Service     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22680 on 389 degrees of freedom
## Multiple R-squared:  0.4492, Adjusted R-squared:  0.4393 
## F-statistic: 45.33 on 7 and 389 DF,  p-value: < 2.2e-16
vif(inttest_lm1)
##                               GVIF Df GVIF^(1/(2*Df))
## Rank                     10.580134  2        1.803528
## Yrs_Faculty_Service       1.924333  1        1.387203
## Sex                       1.032633  1        1.016186
## Discipline                1.029042  1        1.014417
## Rank:Yrs_Faculty_Service  8.014718  2        1.682566
# second model testing interaction bewtween years since phd and discipline 

inttest_lm2 <- lm(Salary ~ Rank + Yrs_Since_PhD + Sex + Discipline + Yrs_Since_PhD*Discipline, data = faculty)
inttest_lm2
## 
## Call:
## lm(formula = Salary ~ Rank + Yrs_Since_PhD + Sex + Discipline + 
##     Yrs_Since_PhD * Discipline, data = faculty)
## 
## Coefficients:
##               (Intercept)              RankAssocProf  
##                115081.390                 -32767.742  
##              RankAsstProf              Yrs_Since_PhD  
##                -45684.696                      8.332  
##                   SexMale                DisciplineB  
##                  4464.127                  11286.218  
## Yrs_Since_PhD:DisciplineB  
##                   117.789
plot(inttest_lm2)

summary(inttest_lm2)
## 
## Call:
## lm(formula = Salary ~ Rank + Yrs_Since_PhD + Sex + Discipline + 
##     Yrs_Since_PhD * Discipline, data = faculty)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69074 -14118  -1386  10575  95921 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               115081.390   5664.698  20.316   <2e-16 ***
## RankAssocProf             -32767.742   3555.818  -9.215   <2e-16 ***
## RankAsstProf              -45684.696   4277.531 -10.680   <2e-16 ***
## Yrs_Since_PhD                  8.332    151.148   0.055   0.9561    
## SexMale                     4464.127   3882.387   1.150   0.2509    
## DisciplineB                11286.218   4739.175   2.381   0.0177 *  
## Yrs_Since_PhD:DisciplineB    117.789    182.886   0.644   0.5199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22680 on 390 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4393 
## F-statistic: 52.71 on 6 and 390 DF,  p-value: < 2.2e-16
# FINAL MODELS:
# first model only include years of faculty service, sex, discipline
salary_lm6 <- lm(Salary ~ Yrs_Faculty_Service + Sex + Discipline, data = faculty)
plot(salary_lm6)

vif(salary_lm6)
## Yrs_Faculty_Service                 Sex          Discipline 
##            1.053649            1.025117            1.028760
summary(salary_lm6)
## 
## Call:
## lm(formula = Salary ~ Yrs_Faculty_Service + Sex + Discipline, 
##     data = faculty)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77424 -19404  -4635  15539 105391 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          84361.1     4941.4  17.072  < 2e-16 ***
## Yrs_Faculty_Service    832.2      110.2   7.550 3.07e-13 ***
## SexMale               8423.3     4744.5   1.775   0.0766 .  
## DisciplineB          13033.8     2840.3   4.589 6.01e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared:  0.1646, Adjusted R-squared:  0.1582 
## F-statistic: 25.81 on 3 and 393 DF,  p-value: 2.928e-15
# second model only include rank, sex, discipline
salary_lm7 <- lm(Salary ~ Rank + Sex + Discipline, data = faculty)
plot(salary_lm7)

vif(salary_lm7)
##                GVIF Df GVIF^(1/(2*Df))
## Rank       1.034437  2        1.008500
## Sex        1.022339  1        1.011108
## Discipline 1.012236  1        1.006099
summary(salary_lm7)
## 
## Call:
## lm(formula = Salary ~ Rank + Sex + Discipline, data = faculty)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66268 -14127  -1566  10813  97718 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     115627       4009  28.841  < 2e-16 ***
## RankAssocProf   -33680       3177 -10.600  < 2e-16 ***
## RankAsstProf    -47403       3133 -15.130  < 2e-16 ***
## SexMale           4492       3860   1.164    0.245    
## DisciplineB      13709       2295   5.972 5.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22640 on 392 degrees of freedom
## Multiple R-squared:  0.4469, Adjusted R-squared:  0.4412 
## F-statistic: 79.18 on 4 and 392 DF,  p-value: < 2.2e-16

Stargazer tables for #4 Results:

salary_table <- stargazer(salary_lm6, salary_lm7, type="html")
Dependent variable:
Salary
(1) (2)
Yrs_Faculty_Service 832.154***
(110.215)
RankAssocProf -33,679.900***
(3,177.338)
RankAsstProf -47,403.320***
(3,133.108)
SexMale 8,423.335* 4,491.801
(4,744.537) (3,860.240)
DisciplineB 13,033.850*** 13,708.690***
(2,840.349) (2,295.437)
Constant 84,361.070*** 115,626.800***
(4,941.371) (4,009.133)
Observations 397 397
R2 0.165 0.447
Adjusted R2 0.158 0.441
Residual Std. Error 27,789.810 (df = 393) 22,640.990 (df = 392)
F Statistic 25.810*** (df = 3; 393) 79.180*** (df = 4; 392)
Note: p<0.1; p<0.05; p<0.01
salary_table
[1] “”
[2] “ "
[3] “ "
[4] “ "
[5] “ "
[6] “ "
[7] “ "
[8] “ "
[9] “ "
[10] “ "
[11] “ "
[12] “ "
[13] “ "
[14] “ "
[15] “ "
[16] “ "
[17] “ "
[18] “ "
[19] “ "
[20] “ "
[21] “ "
[22] “ "
[23] “ "
[24] “ "
[25] “ "
[26] “ "
[27] “ "
[28] “ "
[29] “ " [30] “
Dependent variable:
Salary
(1) (2)
Yrs_Faculty_Service 832.154***
(110.215)
RankAssocProf -33,679.900***
(3,177.338)
RankAsstProf -47,403.320***
(3,133.108)
SexMale 8,423.335* 4,491.801
(4,744.537) (3,860.240)
DisciplineB 13,033.850*** 13,708.690***
(2,840.349) (2,295.437)
Constant 84,361.070*** 115,626.800***
(4,941.371) (4,009.133)
Observations 397 397
R2 0.165 0.447
Adjusted R2 0.158 0.441
Residual Std. Error 27,789.810 (df = 393) 22,640.990 (df = 392)
F Statistic 25.810*** (df = 3; 393) 79.180*** (df = 4; 392)
Note: p<0.1; p<0.05; p<0.01

"

Finalized graphs for #4

# finalized graph for interaction between years of service and rank

service_rank_final <- ggplot(faculty, aes(x = Yrs_Faculty_Service, y = Salary, color = Rank)) +
  geom_point(aes(fill = Rank)) +
  geom_smooth(aes(Yrs_Faculty_Service, Salary, color = Rank), method = lm, se = FALSE) +
  scale_y_continuous(expand = c(0,0), limits = c(50000,300000)) +
  scale_x_continuous(expand = c(0,0), limits = c(-0.5,61), breaks = seq(0,60,by = 10)) +
  theme_classic() +
  labs(x = "Years of Faculty Service", y = "Salary (US Dollars)") + 
  scale_color_manual(breaks = c("AsstProf", "AssocProf", "Prof"),
                     values = c( "dodgerblue3","darkgoldenrod2", "tomato1"),
                     labels = c("Assistant Professor", "Associate Professor", "Professor")) +
  guides(fill = FALSE)
service_rank_final